import os
import time

from pyspark import *

if __name__ == '__main__':
    # Run on YARN; for single-machine debugging switch to .setMaster("local[*]").
    conf = SparkConf().setAppName("HelloWorld") \
        .setMaster("yarn")

    sc = SparkContext(conf=conf)
    # Checkpoint data must go to reliable storage (HDFS here); an in-memory
    # checkpoint directory is not supported.
    sc.setCheckpointDir("hdfs://hadoop3cluster/user/output/checkpoint")

    # Print the driver's working-directory layout — useful when the local
    # file:// input path below is enabled instead of the HDFS one.
    print(os.getcwd())
    print(os.listdir(os.getcwd()))
    print(os.path.dirname(os.getcwd()))
    print(os.path.dirname(os.path.dirname(os.getcwd())))

    # Local-file alternative for debugging:
    # input_path = 'file://' + os.path.dirname(os.getcwd()) + '/data/00_example_HelloWorld.txt'
    input_path = "hdfs://hadoop3cluster/updown/WordCountInput.txt"
    rdd1 = sc.textFile(input_path)
    rdd2 = rdd1.flatMap(lambda t: t.split(" "))
    rdd3 = rdd2.map(lambda t: (t, 1))
    # NOTE(review): checkpoint() without a prior persist() makes Spark
    # recompute rdd3 when the checkpoint job runs after the first action.
    # cache() is equivalent to persist(StorageLevel.MEMORY_ONLY).
    # rdd3.persist(StorageLevel.MEMORY_AND_DISK_2)
    rdd3.checkpoint()

    # Word count via reduceByKey (does map-side combining before the shuffle).
    rdd4 = rdd3.reduceByKey(lambda a, b: a + b)
    print(rdd4.collect())

    # Same result via groupByKey + sum, shown for comparison
    # (shuffles every value across the network).
    rdd5 = rdd3.groupByKey()
    # print(rdd5.mapValues(list).collect())
    rdd6 = rdd5.mapValues(sum)
    print(rdd6.collect())
    # No-op unless the persist() call above is re-enabled.
    rdd3.unpersist()

    # Keep the driver alive so the Spark web UI stays inspectable.
    time.sleep(1000000)
